{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import json\n",
    "from collections import defaultdict, Counter\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "\n",
    "# Single root for all data directories: relocating the project data only\n",
    "# requires editing this one line instead of three separate literals.\n",
    "data_root = Path('/home/bnu/projects/CCKS2020-Entity-Linking/data')\n",
    "\n",
    "# Directory of the raw dataset files; kb.json is read from here.\n",
    "raw_path = data_root / 'ccks2020_el_data_v1'\n",
    "# NOTE(review): presumably the output locations for pickled / CSV artifacts;\n",
    "# neither is used in this part of the notebook - confirm against later cells.\n",
    "pickle_path = data_root / 'pickle'\n",
    "csv_path = data_root / 'csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup tables filled from kb.json, which is parsed as one JSON object per line.\n",
    "entity_to_kbids = defaultdict(set)  # surface form (subject or alias) -> set of subject_ids\n",
    "kbid_to_entities = dict()           # subject_id -> set of its surface forms\n",
    "kbid_to_text = dict()               # subject_id -> 'predicate:object' pairs joined by spaces\n",
    "kbid_to_types = dict()              # subject_id -> list of type names\n",
    "\n",
    "# Type vocabulary; indices are assigned in order of first appearance in the file.\n",
    "idx_to_type = []\n",
    "type_to_idx = dict()\n",
    "\n",
    "# The knowledge base contains Chinese and other non-ASCII text, so decode it\n",
    "# explicitly as UTF-8 rather than relying on the platform's locale encoding.\n",
    "with open(raw_path/'kb.json', 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        temp = json.loads(line)\n",
    "\n",
    "        # Every alias plus the canonical subject name maps back to this id.\n",
    "        kbid = temp['subject_id']\n",
    "        entities = set(temp['alias'])\n",
    "        entities.add(temp['subject'])\n",
    "        for entity in entities:\n",
    "            entity_to_kbids[entity].add(kbid)\n",
    "        kbid_to_entities[kbid] = entities\n",
    "\n",
    "        # Flatten the attribute records into a single descriptive string.\n",
    "        data_list = [':'.join([x['predicate'], x['object']]) for x in temp['data']]\n",
    "        kbid_to_text[kbid] = ' '.join(data_list)\n",
    "\n",
    "        # 'type' is a '|'-separated string; each unseen type gets the next free index.\n",
    "        type_list = temp['type'].split('|')\n",
    "        kbid_to_types[kbid] = type_list\n",
    "        for t in type_list:\n",
    "            if t not in type_to_idx:\n",
    "                type_to_idx[t] = len(idx_to_type)\n",
    "                idx_to_type.append(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'10001': {'张健'},\n",
       " '10002': {'Sacred journey', '圣途'},\n",
       " '10003': {'李明'},\n",
       " '10004': {'陈臻'},\n",
       " '10005': {'承志', '爱新觉罗·承志'},\n",
       " '10006': {'COC'},\n",
       " '10007': {'王建平'},\n",
       " '10008': {'海盗'},\n",
       " '10009': {'原型'},\n",
       " '10010': {'光'},\n",
       " '10011': {'FAIRY TAIL : HOUOU NO MIKO', '妖精的尾巴《凤凰的巫女》', '妖精的尾巴剧场版'},\n",
       " '10012': {'天梯'},\n",
       " '10013': {'李志军'},\n",
       " '10014': {'周红斌'},\n",
       " '10015': {'散户'},\n",
       " '10016': {'李潇'},\n",
       " '10017': {'李鑫'},\n",
       " '10018': {'配角'},\n",
       " '10019': {'十八'},\n",
       " '10020': {'诸葛亮'},\n",
       " '10021': {'摆渡人'},\n",
       " '10022': {'杭州市城市土地发展有限公司'},\n",
       " '10023': {'Frozen', '冷若冰霜'},\n",
       " '10024': {'徐新建'},\n",
       " '10025': {'马壮'},\n",
       " '10026': {'Longquanyi District',\n",
       "  '成都龙泉',\n",
       "  '成都龙泉驿',\n",
       "  '成都龙泉驿区',\n",
       "  '龙泉',\n",
       "  '龙泉驿',\n",
       "  '龙泉驿区'},\n",
       " '10027': {'工程制图习题集'},\n",
       " '10028': {'萤火虫'},\n",
       " '10029': {'Gravity', '地心引力'},\n",
       " '10030': {'三生石'},\n",
       " '10031': {'aggregate', '集', '集合'},\n",
       " '10032': {'在那东山顶上'},\n",
       " '10033': {'郝梅'},\n",
       " '10034': {'刀剑如梦'},\n",
       " '10035': {'波斯语：جلال دین محمد اکبر拉丁化：Jalal din Muhammad Akbar',\n",
       "  '阿克巴',\n",
       "  '阿克巴(1542～1605)',\n",
       "  '阿克巴大帝',\n",
       "  '阿布·乌尔法特·贾拉尔丁·穆罕默德·阿克巴'},\n",
       " '10036': {'Meerkat, Suricate',\n",
       "  'Slender-tailed Meerkat',\n",
       "  'Suricata suricatta',\n",
       "  '沼狸',\n",
       "  '海岛猫鼬',\n",
       "  '灰沼狸',\n",
       "  '灰爪狸',\n",
       "  '狐獴',\n",
       "  '猫鼬',\n",
       "  '细尾獴'},\n",
       " '10037': {'遗产', '遗嘱'},\n",
       " '10038': {\"Yue Fei's Army\", '岳家军'},\n",
       " '10039': {'胡斌'},\n",
       " '10040': {'杨林'},\n",
       " '10041': {'SichuanProvinceFushun Chengguan Middle School',\n",
       "  '四川富顺城关中学',\n",
       "  '四川省富顺县城关中学',\n",
       "  '四川省自贡市富顺县城关中学',\n",
       "  '城关七中',\n",
       "  '城关中学',\n",
       "  '城关中学富顺学校',\n",
       "  '富顺七中',\n",
       "  '富顺县城关中学',\n",
       "  '富顺县城关中学校',\n",
       "  '富顺城关',\n",
       "  '富顺城关中学',\n",
       "  '富顺城关中学校'},\n",
       " '10042': {'我把最美的歌唱给妈妈', '最美的歌唱给妈妈'},\n",
       " '10043': {'王巍'},\n",
       " '10044': {'情人节'},\n",
       " '10045': {'凯'},\n",
       " '10046': {'Specters', '冥斗士'},\n",
       " '10047': {'刘英'},\n",
       " '10048': {'西游记'},\n",
       " '10049': {'李斯'},\n",
       " '10050': {'高晓东'},\n",
       " '10051': {'海市蜃楼'},\n",
       " '10052': {'Szép napok', '我那美好残酷的青春'},\n",
       " '10053': {'周瑾'},\n",
       " '10054': {'月月', '月月【非全名】'},\n",
       " '10055': {'南平', '南平镇'},\n",
       " '10056': {'人气美食'},\n",
       " '10057': {'六院'},\n",
       " '10058': {'Ibaraki-douji', '茨木童子'},\n",
       " '10059': {'压榨'},\n",
       " '10060': {'拖斗'},\n",
       " '10061': {'七级镇'},\n",
       " '10062': {'王小兰'},\n",
       " '10063': {'徐鹏'},\n",
       " '10064': {'Master of art', '绘画大师', '艺术大师'},\n",
       " '10065': {'王芳'},\n",
       " '10066': {'三国演义'},\n",
       " '10067': {'鲁山'},\n",
       " '10068': {'Guangzhou University', '广大', '广大大学', '广大（GZ）', '广州大学', '广州师范学院'},\n",
       " '10069': {'陈金荣'},\n",
       " '10070': {'LIU TAO', '刘涛'},\n",
       " '10071': {'tiny', '渺小'},\n",
       " '10072': {'51'},\n",
       " '10073': {'张明'},\n",
       " '10074': {'李国强'},\n",
       " '10075': {'You are a nice lady：but why...', '卿本佳人'},\n",
       " '10076': {'二货'},\n",
       " '10077': {'秘方'},\n",
       " '10078': {'群魔乱舞'},\n",
       " '10079': {'初二'},\n",
       " '10080': {'天台'},\n",
       " '10081': {'monopoly', '专卖'},\n",
       " '10082': {'rumours', '谣传'},\n",
       " '10083': {'液压与气动技术'},\n",
       " '10084': {'candy'},\n",
       " '10085': {'诸葛村'},\n",
       " '10086': {'末日风暴'},\n",
       " '10087': {'知道不知道'},\n",
       " '10088': {'英雄无泪'},\n",
       " '10089': {'落选'},\n",
       " '10090': {'Endowment insurance cost', '养老保险保费', '养老保险费', '养老费'},\n",
       " '10091': {'抓住'},\n",
       " '10092': {'晏几道', '晏小山'},\n",
       " '10093': {'梦间集'},\n",
       " '10094': {'尚文波'},\n",
       " '10095': {'FuchuanYao Autonomous County', '富川', '富川县', '富川瑶族自治县', '广西富川'},\n",
       " '10096': {'南山在北不在南', '风'},\n",
       " '10097': {'万岁'},\n",
       " '10098': {'创业时代'},\n",
       " '10099': {'小八'},\n",
       " '10100': {'许可'},\n",
       " '10101': {'《长大成人》', '长大成人'},\n",
       " '10102': {'蝶恋花'},\n",
       " '10103': {'张华'},\n",
       " '10104': {'李威'},\n",
       " '10105': {'黄藤'},\n",
       " '10106': {'计算机应用基础'},\n",
       " '10107': {'四季'},\n",
       " '10108': {'股神'},\n",
       " '10109': {'归'},\n",
       " '10110': {'人间真情'},\n",
       " '10111': {'冢原卜传', '塚原卜伝'},\n",
       " '10112': {'新兵日记'},\n",
       " '10113': {'梅'},\n",
       " '10114': {'柳兰芳'},\n",
       " '10115': {'Nurse practitioner', '护士师', '护理师'},\n",
       " '10116': {'KO-SO-KO-SO', 'Ko'},\n",
       " '10117': {'桃花源记'},\n",
       " '10118': {'小富'},\n",
       " '10119': {'刘晶波'},\n",
       " '10120': {'都'},\n",
       " '10121': {'张宇'},\n",
       " '10122': {'概率论与数理统计'},\n",
       " '10123': {'Lord Of Shanghai', '功夫', '枭雄'},\n",
       " '10124': {'JAZZ'},\n",
       " '10125': {'李睿'},\n",
       " '10126': {'cook/chef', '厨子', '厨师', '厨男', '后厨', '大厨', '川菜'},\n",
       " '10127': {'田源'},\n",
       " '10128': {'林翠萍'},\n",
       " '10129': {'中国创造'},\n",
       " '10130': {'Chongqing garden expo garden', '园博园', '园博园重庆', '重庆园博园'},\n",
       " '10131': {'Diagonal', '对角线'},\n",
       " '10132': {'死士'},\n",
       " '10133': {'情'},\n",
       " '10134': {'恰同学少年',\n",
       "  '沁园春-长沙',\n",
       "  '沁园春.长沙',\n",
       "  '沁园春·独立寒秋',\n",
       "  '沁园春·长沙',\n",
       "  '沁园春之长沙',\n",
       "  '沁园春长沙',\n",
       "  '沁春园长沙'},\n",
       " '10135': {'谢敏'},\n",
       " '10136': {'林嘉'},\n",
       " '10137': {'false accounting', '假账'},\n",
       " '10138': {'韩辉'},\n",
       " '10139': {'Journey to the west', '西游记'},\n",
       " '10140': {'张建军'},\n",
       " '10141': {'哑女', '桃花深处'},\n",
       " '10142': {'The Child', '小孩'},\n",
       " '10143': {'枫'},\n",
       " '10144': {'没有'},\n",
       " '10145': {'古玩大亨'},\n",
       " '10146': {'雾里看花'},\n",
       " '10147': {'Englishonline translation', '英语在线翻译'},\n",
       " '10148': {'张爱林'},\n",
       " '10149': {'冷晖之枪（称号）', '马超'},\n",
       " '10150': {'Nidalee,the Bestial Huntress',\n",
       "  '奈德丽',\n",
       "  '奶大力',\n",
       "  '狂野女猎手',\n",
       "  '狂野女猎手·奈德丽',\n",
       "  '狂野猎人',\n",
       "  '狂野猎手',\n",
       "  '豹女'},\n",
       " '10151': {'猫和老鼠', '猫和老鼠手游', '猫和老鼠手游10'},\n",
       " '10152': {'蔡明'},\n",
       " '10153': {'毕生'},\n",
       " '10154': {'ksama', '忏悔'},\n",
       " '10155': {'你是我的唯一', '你是我的，唯一'},\n",
       " '10156': {'mouth', '嘴', '嘴里'},\n",
       " '10157': {'胡军'},\n",
       " '10158': {'“我和草原有个约定”齐峰人民大会堂独唱民族音乐会', '我和草原有个约定'},\n",
       " '10159': {'张育林'},\n",
       " '10160': {'十八般武艺'},\n",
       " '10161': {'材料力学'},\n",
       " '10162': {'张燕'},\n",
       " '10163': {'刘杰'},\n",
       " '10164': {'陈涛'},\n",
       " '10165': {'先生'},\n",
       " '10166': {'带刺的玫瑰', '滴血的刺刀'},\n",
       " '10167': {'董事会主席', '董事局主席'},\n",
       " '10168': {'胡明德'},\n",
       " '10169': {'李嘉琪'},\n",
       " '10170': {'境界の彼方', '境界的彼方'},\n",
       " '10171': {'陈俊豪', '陈圣仑', '陈石'},\n",
       " '10172': {'龙江'},\n",
       " '10173': {'Imperial examination（English）',\n",
       "  '中国古代科举制度',\n",
       "  '中国古代科举考试制度',\n",
       "  '中国科举制度',\n",
       "  '古代科举制度',\n",
       "  '封建科举制度',\n",
       "  '科举',\n",
       "  '科举制',\n",
       "  '科举制度',\n",
       "  '科举考试',\n",
       "  '科举考试制度',\n",
       "  '科举选士制度'},\n",
       " '10174': {'凤凰山公园'},\n",
       " '10175': {'Robert Downey Jr',\n",
       "  'Robert Downey Jr.',\n",
       "  'Robert John Downey Jr.',\n",
       "  '唐尼',\n",
       "  '唐尼小罗伯特',\n",
       "  '唐尼罗伯特',\n",
       "  '小劳勃·道尼',\n",
       "  '小唐尼',\n",
       "  '小唐尼罗伯特',\n",
       "  '小罗伯特',\n",
       "  '小罗伯特·唐尼',\n",
       "  '小罗伯特·约翰·唐尼',\n",
       "  '小罗伯特唐尼',\n",
       "  '罗伯特·唐尼',\n",
       "  '罗伯特唐尼'},\n",
       " '10176': {'杨青'},\n",
       " '10177': {'向伟'},\n",
       " '10178': {'ACE', 'Vorarit Vaijairanai'},\n",
       " '10179': {'faker', '李相赫', '이상혁'},\n",
       " '10180': {'试探'},\n",
       " '10181': {'Marina', '玛丽娜'},\n",
       " '10182': {'张杨'},\n",
       " '10183': {'蝴蝶谷'},\n",
       " '10184': {'廉颇'},\n",
       " '10185': {'National Nature Reserve',\n",
       "  '中国国家级自然保护区',\n",
       "  '国家级自然保护区',\n",
       "  '国家自然保护区',\n",
       "  '自级'},\n",
       " '10186': {'杨帆'},\n",
       " '10187': {'hometown', '家乡'},\n",
       " '10188': {'王卫平'},\n",
       " '10189': {'谁上了我的床'},\n",
       " '10190': {'烟雨'},\n",
       " '10191': {'今日'},\n",
       " '10192': {'天平'},\n",
       " '10193': {'橘颂'},\n",
       " '10194': {'Edward Rochester', '爱德华', '罗切斯特'},\n",
       " '10195': {'追求'},\n",
       " '10196': {'李政'},\n",
       " '10197': {'Classroom', '教室'},\n",
       " '10198': {'Welfare', '福利'},\n",
       " '10199': {'主要矛盾'},\n",
       " '10200': {'肖冰'},\n",
       " '10201': {'错'},\n",
       " '10202': {'苏', '苏姓', '苏姓族考', '苏氏'},\n",
       " '10203': {'Royalty-Managed', 'rm', '特定使用范围版权模式'},\n",
       " '10204': {'王小明'},\n",
       " '10205': {'张咏梅'},\n",
       " '10206': {'乌合之众'},\n",
       " '10207': {'李磊'},\n",
       " '10208': {'将进酒'},\n",
       " '10209': {'杨三郎'},\n",
       " '10210': {'Arabian Nights', '《一千零一夜》', '一千零一夜', '阿拉伯之夜'},\n",
       " '10211': {'大力水手'},\n",
       " '10212': {'和你在一起'},\n",
       " '10213': {'Harley-Davidson',\n",
       "  'HarleyDavidson',\n",
       "  'harley davidson',\n",
       "  'harley·davidson',\n",
       "  '哈雷',\n",
       "  '哈雷-戴维森',\n",
       "  '哈雷·戴维森',\n",
       "  '哈雷戴维森'},\n",
       " '10214': {'小别离'},\n",
       " '10215': {'刘威'},\n",
       " '10216': {'虫儿飞'},\n",
       " '10217': {'本人'},\n",
       " '10218': {'刘文华'},\n",
       " '10219': {'费率'},\n",
       " '10220': {'果实'},\n",
       " '10221': {'啤酒谋杀案'},\n",
       " '10222': {'王坤'},\n",
       " '10223': {'王绍武'},\n",
       " '10224': {\"ferule；teacher's ruler for beating the students\", '戒尺'},\n",
       " '10225': {'Lukas', '保镖'},\n",
       " '10226': {'滚石'},\n",
       " '10227': {'Fried Tofu', '家常豆腐', '家豆腐'},\n",
       " '10228': {'刘斌'},\n",
       " '10229': {'Kad Linx', 'Kaili', '凯里', '凯里市'},\n",
       " '10230': {'手指'},\n",
       " '10231': {'To breathe in', '吸气'},\n",
       " '10232': {'奈美', '娜美'},\n",
       " '10233': {'TIGER & BUNNY', 'TIGER&BUNNY', '老虎与兔子', '老虎和兔子', '虎兔英雄传'},\n",
       " '10234': {'Renmin Road', '人民路'},\n",
       " '10235': {'疯狂'},\n",
       " '10236': {'清风寨'},\n",
       " '10237': {'世界经典文学名著博览·中国古典文学馆·岳飞传', '世界经典文学名著博览：岳飞传', '岳飞传'},\n",
       " '10238': {'一样的月光'},\n",
       " '10239': {'笑'},\n",
       " '10240': {'吴桂云'},\n",
       " '10241': {'蝴蝶君'},\n",
       " '10242': {'辣眼睛'},\n",
       " '10243': {'仆とジュリエットとジェットコースター', '朱丽叶'},\n",
       " '10244': {'王立军'},\n",
       " '10245': {'养子'},\n",
       " '10246': {'李峰'},\n",
       " '10247': {'polarity', '极性'},\n",
       " '10248': {'新啼笑因缘'},\n",
       " '10249': {'蓝猫'},\n",
       " '10250': {'accommodation', '迁就'},\n",
       " '10251': {'高松', '高松市'},\n",
       " '10252': {'沈清'},\n",
       " '10253': {'冯力'},\n",
       " '10254': {'张超'},\n",
       " '10255': {'Unlock', '解锁'},\n",
       " '10256': {'祝酒歌'},\n",
       " '10257': {'snow', '雪'},\n",
       " '10258': {'天之骄子'},\n",
       " '10259': {'无法长大'},\n",
       " '10260': {'张海军'},\n",
       " '10261': {'Proverbs of Love', '爱的箴言'},\n",
       " '10262': {'李萌'},\n",
       " '10263': {'陈赫', '진혁'},\n",
       " '10264': {'A big house', '好一个大家', '好大一个大家', '好大一个家', '好大一家', '好大个家'},\n",
       " '10265': {'白蛇后传'},\n",
       " '10266': {'Gareth Evans', '加雷斯·埃文斯'},\n",
       " '10267': {'张岚'},\n",
       " '10268': {'lesson', '教训'},\n",
       " '10269': {'刘楠'},\n",
       " '10270': {'中餐厅'},\n",
       " '10271': {'马驰'},\n",
       " '10272': {'phreatic water', '潜水'},\n",
       " '10273': {'好'},\n",
       " '10274': {'mama'},\n",
       " '10275': {'陌言川'},\n",
       " '10276': {'刘晓'},\n",
       " '10277': {'考据癖'},\n",
       " '10278': {'ZhaoGu', '赵固乡'},\n",
       " '10279': {'Hermione',\n",
       "  'Hermione Jane Granger/Hermione Jean Granger',\n",
       "  '万事通',\n",
       "  '十全十美小姐',\n",
       "  '妙丽',\n",
       "  '妙丽大板牙',\n",
       "  '格兰杰',\n",
       "  '赫——米——翁',\n",
       "  '赫—米—恩',\n",
       "  '赫敏',\n",
       "  '赫敏.格兰杰',\n",
       "  '赫敏·格兰杰',\n",
       "  '赫敏·简·格兰杰',\n",
       "  '赫敏格兰杰',\n",
       "  '赫米',\n",
       "  '问题多小姐'},\n",
       " '10280': {'盐田村'},\n",
       " '10281': {'Michael Faraday',\n",
       "  '法拉第',\n",
       "  '迈克尔',\n",
       "  '迈克尔-法拉第',\n",
       "  '迈克尔.法拉第',\n",
       "  '迈克尔·法拉第',\n",
       "  '麦可·法拉第'},\n",
       " '10282': {'I Will Wait for You', '年年有今日'},\n",
       " '10283': {'李冰'},\n",
       " '10284': {'李阳'},\n",
       " '10285': {'Black Coal', 'Thin Ice', '白日火焰', '白日焰火', '白焰'},\n",
       " '10286': {'扬州日报'},\n",
       " '10287': {'Leviathan', '巨兽'},\n",
       " '10288': {'交叉点'},\n",
       " '10289': {'张巍'},\n",
       " '10290': {'Frozen', '冻结'},\n",
       " '10291': {'杨超'},\n",
       " '10292': {'不离不弃'},\n",
       " '10293': {'yeshilun', '轮毂灯'},\n",
       " '10294': {'叶波'},\n",
       " '10295': {'郭有栋'},\n",
       " '10296': {'Lovelace', '拉芙蕾丝'},\n",
       " '10297': {'管理学'},\n",
       " '10298': {'雷震'},\n",
       " '10299': {'张应华'},\n",
       " '10300': {'William Liu', '刘伟'},\n",
       " '10301': {'无羁'},\n",
       " '10302': {'童童', '许云龙'},\n",
       " '10303': {'王秀丽'},\n",
       " '10304': {'王建国'},\n",
       " '10305': {'高英'},\n",
       " '10306': {'魔仙', '魔仙小公主'},\n",
       " '10307': {'小爷'},\n",
       " '10308': {'刘涛'},\n",
       " '10309': {'杨雄'},\n",
       " '10310': {'根'},\n",
       " '10311': {'发色'},\n",
       " '10312': {'佛爷', '张启山', '张大佛爷'},\n",
       " '10313': {'家破人亡'},\n",
       " '10314': {'喜欢你'},\n",
       " '10315': {'卫风.氓',\n",
       "  '卫风·氓',\n",
       "  '卫风氓',\n",
       "  '国风·卫风·氓',\n",
       "  '氓',\n",
       "  '诗·卫风·氓',\n",
       "  '诗经.氓',\n",
       "  '诗经·国风·卫风·氓',\n",
       "  '诗经·氓',\n",
       "  '诗经。氓',\n",
       "  '诗经氓'},\n",
       " '10316': {'Yu Opera', '河南梆子', '河南豫剧', '豫剧'},\n",
       " '10317': {'蓝色'},\n",
       " '10318': {'lowpeow', '老表'},\n",
       " '10319': {'刘睿'},\n",
       " '10320': {'过门'},\n",
       " '10321': {'汪汪'},\n",
       " '10322': {'陈刚'},\n",
       " '10323': {'张敏'},\n",
       " '10324': {'Kong Qingdong', '东博神相', '北大醉侠', '孔庆东'},\n",
       " '10325': {'梅家坞'},\n",
       " '10326': {'A mission', '使命', '反黑使命'},\n",
       " '10327': {'刘亚洲'},\n",
       " '10328': {'张晓辉'},\n",
       " '10329': {'夏天的味道'},\n",
       " '10330': {'CHENQUN', '陈群'},\n",
       " '10331': {'浮出水面'},\n",
       " '10332': {'《七月与安生》', '七月与安生'},\n",
       " '10333': {'张大力'},\n",
       " '10334': {'春天'},\n",
       " '10335': {'打擂'},\n",
       " '10336': {'两晋', '晋', '晋国', '晋朝'},\n",
       " '10337': {'Guide', '指引'},\n",
       " '10338': {'书法'},\n",
       " '10339': {'Scud', '“飞毛腿”导弹', '飞毛腿', '飞毛腿导弹'},\n",
       " '10340': {'马骏'},\n",
       " '10341': {'陈军'},\n",
       " '10342': {'现身'},\n",
       " '10343': {'类似爱情'},\n",
       " '10344': {'陈良'},\n",
       " '10345': {'跳跃小子'},\n",
       " '10346': {'黄震'},\n",
       " '10347': {'河间府衙'},\n",
       " '10348': {'沃尔克'},\n",
       " '10349': {'杨羽'},\n",
       " '10350': {'psychiatric disorders', '精神失常'},\n",
       " '10351': {'我要的幸福'},\n",
       " '10352': {'石破天惊'},\n",
       " '10353': {'张瑜'},\n",
       " '10354': {'录音机'},\n",
       " '10355': {'后天'},\n",
       " '10356': {'贾凤山'},\n",
       " '10357': {'我怎能离开你'},\n",
       " '10358': {'背叛'},\n",
       " '10359': {'Friends with Benefits', '朋友也上床', '朋友也上床/炮友关系', '炮友', '炮友关系'},\n",
       " '10360': {'李永清'},\n",
       " '10361': {'你总有爱我的一天'},\n",
       " '10362': {'王道'},\n",
       " '10363': {'许利民'},\n",
       " '10364': {'MR.U', '悠客'},\n",
       " '10365': {'李青'},\n",
       " '10366': {'Refugee', '难民'},\n",
       " '10367': {'嘉峪关'},\n",
       " '10368': {'陈少雄'},\n",
       " '10369': {'柳毅传书'},\n",
       " '10370': {'entrepreneur', '创业者'},\n",
       " '10371': {'树种'},\n",
       " '10372': {'Ma Yun', '马云'},\n",
       " '10373': {'陈志刚'},\n",
       " '10374': {'般若波罗蜜多心经'},\n",
       " '10375': {'Dribblebreakthrough', '突破'},\n",
       " '10376': {'张剑峰'},\n",
       " '10377': {'Dancer', '舞者'},\n",
       " '10378': {'MENGMA', '猛犸'},\n",
       " '10379': {'陈光'},\n",
       " '10380': {'胡建军'},\n",
       " '10381': {'嘉兴交通局', '嘉兴市交通局', '嘉兴市交通运输局'},\n",
       " '10382': {'张晶'},\n",
       " '10383': {'萧清'},\n",
       " '10384': {'天梯', '天梯\\u3000'},\n",
       " '10385': {'笙'},\n",
       " '10386': {'center of gravity（重心）；core（核心）', '重心'},\n",
       " '10387': {'新华镇'},\n",
       " '10388': {'小Q'},\n",
       " '10389': {'阅江楼记'},\n",
       " '10390': {'海月姫', '海月姬', '海月姬 ～水母公主～', '海月姬-くらげひめ-'},\n",
       " '10391': {'王俊峰'},\n",
       " '10392': {'隔墙有耳'},\n",
       " '10393': {'Cholesterol', 'cholesterol', '胆固醇', '胆甾醇'},\n",
       " '10394': {'陈萍'},\n",
       " '10395': {'徐旭'},\n",
       " '10396': {'喜来乐'},\n",
       " '10397': {'Nang Tard/นางทาส', '女奴'},\n",
       " '10398': {'陈亮'},\n",
       " '10399': {'村长'},\n",
       " '10400': {'Custody', '监护权'},\n",
       " '10401': {'杀不死'},\n",
       " '10402': {'迎燕'},\n",
       " '10403': {'张宏'},\n",
       " '10404': {'辛平'},\n",
       " '10405': {'鸡蛋炒面'},\n",
       " '10406': {'吴捷'},\n",
       " '10407': {'修复'},\n",
       " '10408': {'骊姬'},\n",
       " '10409': {'王春燕'},\n",
       " '10410': {'燕妮', '燕妮·马克思'},\n",
       " '10411': {'终于'},\n",
       " '10412': {'逍遥游'},\n",
       " '10413': {'SHUA LAI', '耍赖'},\n",
       " '10414': {'张晓丽'},\n",
       " '10415': {'Snowman Computer Assisted Translation', '雪人', '雪人CAT'},\n",
       " '10416': {'杨霞'},\n",
       " '10417': {'张爱群'},\n",
       " '10418': {'Калининград', '加力宁格勒', '加里宁格勒', '哥尼斯堡'},\n",
       " '10419': {'王伟'},\n",
       " '10420': {'王正华'},\n",
       " '10421': {'刘英'},\n",
       " '10422': {'原配'},\n",
       " '10423': {'pine nut', '松子'},\n",
       " '10424': {'杨开明'},\n",
       " '10425': {'Game For Peace', '和平精英'},\n",
       " '10426': {'bbc'},\n",
       " '10427': {'打更人'},\n",
       " '10428': {'强国'},\n",
       " '10429': {'高晓东'},\n",
       " '10430': {'The Prison', '叛狱无问', '叛狱无间', '监狱', '프리즌'},\n",
       " '10431': {'张瑜'},\n",
       " '10432': {'杨军'},\n",
       " '10433': {'长发'},\n",
       " '10434': {'take chances', '赌博'},\n",
       " '10435': {'cast aside neglect', '撇'},\n",
       " '10436': {'王建平'},\n",
       " '10437': {'云巅之上'},\n",
       " '10438': {'失踪的上清寺'},\n",
       " '10439': {'陈远平'},\n",
       " '10440': {'No Breathing', 'nobreathing', '屏息', '노브레싱 /No Breathing'},\n",
       " '10441': {'李明'},\n",
       " '10442': {'管理学'},\n",
       " '10443': {'01', 'Zero'},\n",
       " '10444': {'王保军'},\n",
       " '10445': {'万花筒'},\n",
       " '10446': {'陈彬'},\n",
       " '10447': {'陈鸿宇'},\n",
       " '10448': {'孙晨'},\n",
       " '10449': {'张睿'},\n",
       " '10450': {'李琦'},\n",
       " '10451': {'孙新强'},\n",
       " '10452': {'博利绍伊乌苏里斯基岛',\n",
       "  '大乌苏里岛',\n",
       "  '大乌苏里斯基岛',\n",
       "  '抚远三角洲',\n",
       "  '抚远黑瞎子岛',\n",
       "  '摩林乌珠岛',\n",
       "  '英语Heixiazi Island 俄语островов Большой Уссурийский',\n",
       "  '黑瞎子',\n",
       "  '黑瞎子岛'},\n",
       " '10453': {'尚志'},\n",
       " '10454': {'张延'},\n",
       " '10455': {'真田幸村'},\n",
       " '10456': {'sunny', '张琨'},\n",
       " '10457': {'Ignorant', '懵懂'},\n",
       " '10458': {'五音'},\n",
       " '10459': {'クリストファー・コロンブス,Christopher Columbus', '克里斯托弗·哥伦布'},\n",
       " '10460': {'nutrient', 'nutritive substance', '养分'},\n",
       " '10461': {'人间(中)复活夜', '人间中卷复活夜'},\n",
       " '10462': {'我在雨中等你'},\n",
       " '10463': {'大型水景秀《红楼梦》', '红楼梦'},\n",
       " '10464': {'Toni Morrison', 'Toni Morrlson', '托妮·莫瑞森', '托妮·莫里森', '托尼·莫里森'},\n",
       " '10465': {'白茶', '福建白茶'},\n",
       " '10466': {'balloon cup', '热气球'},\n",
       " '10467': {'叮咯咙咚呛', '叮咯隆咚呛', '띵꺼롱뚱치앙'},\n",
       " '10468': {'张涛'},\n",
       " '10469': {'保护膜', '膜'},\n",
       " '10470': {'lancy', '梁馨'},\n",
       " '10471': {'你好吗'},\n",
       " '10472': {'プール', '游泳池'},\n",
       " '10473': {'南塘镇'},\n",
       " '10474': {'辽宁吧'},\n",
       " '10475': {'root system', '根系'},\n",
       " '10476': {'打龙袍'},\n",
       " '10477': {'陈建国'},\n",
       " '10478': {'刘凯'},\n",
       " '10479': {'陈铭'},\n",
       " '10480': {'Kodoku', '蛊毒'},\n",
       " '10481': {'姬庄村'},\n",
       " '10482': {'张文斌'},\n",
       " '10483': {'李平'},\n",
       " '10484': {'麦浪'},\n",
       " '10485': {'The Hunting Party', '狩猎派对', '猎狐行动'},\n",
       " '10486': {'我们都一样'},\n",
       " '10487': {'c两优华占'},\n",
       " '10488': {'克林斯曼·昊天', '昊天'},\n",
       " '10489': {'Hydronephrosis', '积水', '肾积水'},\n",
       " '10490': {'Teenager', '少年十五二十时'},\n",
       " '10491': {'李程'},\n",
       " '10492': {'雷雨'},\n",
       " '10493': {'大中华赋'},\n",
       " '10494': {'李翔'},\n",
       " '10495': {'关河'},\n",
       " '10496': {'Chairman of the board', '董事长'},\n",
       " '10497': {'何静'},\n",
       " '10498': {'双生'},\n",
       " '10499': {'Lin Shan Country', '林山乡'},\n",
       " '10500': {'平常'},\n",
       " '10501': {'刚殿 刚爷 小领导 心机班长', '小刚'},\n",
       " '10502': {'陆洋'},\n",
       " '10503': {'十四条向日葵',\n",
       "  '向日葵',\n",
       "  '向日葵梵高',\n",
       "  '梵高向日葵',\n",
       "  '法语：Les Tournesols；荷兰语：Zonnebloemen；英语：Sunflowers',\n",
       "  '花瓶里的十二朵向日葵',\n",
       "  '花瓶里的十四朵向日葵'},\n",
       " '10504': {'Patek Philippe', 'patekphilippe', '百达斐丽', '百达翡丽', '翡丽百达'},\n",
       " '10505': {'黄维'},\n",
       " '10506': {'Carnivàle', '嘉年华'},\n",
       " '10507': {'Yu-Gi-Oh!', 'ゆうぎおう', '游☆戏☆王', '游戏王', '遊☆戲☆王'},\n",
       " '10508': {'通天神探', '黑夜天使'},\n",
       " '10509': {'机械设计基础'},\n",
       " '10510': {'伍美珍'},\n",
       " '10511': {'公平街道', '公平镇'},\n",
       " '10512': {'Shiragawa Atusi', '天门', '白川笃史'},\n",
       " '10513': {'Mortal fix true 2', '凡人修真2'},\n",
       " '10514': {'Coloured drawing or pattern', '彩绘'},\n",
       " '10515': {'Chinese visitor;Chinese tourist', '中国游客'},\n",
       " '10516': {'上阵父子兵'},\n",
       " '10517': {'食神'},\n",
       " '10518': {'水墨图', '水磨图'},\n",
       " '10519': {'中国石化燕山石化公司', '中石化燕山分公司', '燕山石化'},\n",
       " '10520': {'冯力'},\n",
       " '10521': {'理由'},\n",
       " '10522': {'张家坪村'},\n",
       " '10523': {'画师'},\n",
       " '10524': {'厚涂', '涂厚'},\n",
       " '10525': {'陈军'},\n",
       " '10526': {'程红'},\n",
       " '10527': {'牛勇'},\n",
       " '10528': {'女性尿道炎', '尿道炎', '男性尿道炎'},\n",
       " '10529': {'mama'},\n",
       " '10530': {'小李子'},\n",
       " '10531': {'临夏', '临夏县', '临夏回族自治州临夏县'},\n",
       " '10532': {'五味俱全'},\n",
       " '10533': {'NG'},\n",
       " '10534': {'姐姐', '穆霓凰', '郡主', '霓凰', '霓凰郡主', '霓虹'},\n",
       " '10535': {'保卫战'},\n",
       " '10536': {'刘凯'},\n",
       " '10537': {'李壮平'},\n",
       " '10538': {'周骏超'},\n",
       " '10539': {'周媛'},\n",
       " '10540': {'王志远'},\n",
       " '10541': {'九街十八巷'},\n",
       " '10542': {'虎啸龙吟'},\n",
       " '10543': {'陈斌'},\n",
       " '10544': {'夏有乔木雅望天堂'},\n",
       " '10545': {'城隍庙', '武功城隍庙'},\n",
       " '10546': {'Man-Thing', '类人体'},\n",
       " '10547': {'水墨人生'},\n",
       " '10548': {'爱你一生'},\n",
       " '10549': {'妈妈的爱'},\n",
       " '10550': {'杨洁'},\n",
       " '10551': {'一带一路'},\n",
       " '10552': {'无名氏'},\n",
       " '10553': {'温八叉', '温岐', '温庭筠', '温庭钧', '温飞卿', '飞卿'},\n",
       " '10554': {'Bizalom', '信任'},\n",
       " '10555': {'不忘初心方得始终'},\n",
       " '10556': {'惊蛰', '谍战深海之惊蛰', '麻雀2之惊蛰'},\n",
       " '10557': {'《情归何处》', '情归何处'},\n",
       " '10558': {'Wild goose', '孤雁'},\n",
       " '10559': {'奇点', '魏兄', '魏总', '魏渭'},\n",
       " '10560': {'apec千手观音', '《Thousand-Hand Kwan-yin》', '《千手观音》', '千手观音'},\n",
       " '10561': {'张辉'},\n",
       " '10562': {'龙塘村'},\n",
       " '10563': {'王琦'},\n",
       " '10564': {'Kularb Fai', '火焰玫瑰'},\n",
       " '10565': {'琉璃'},\n",
       " '10566': {'刘凯'},\n",
       " '10567': {'One More Time'},\n",
       " '10568': {'林琳'},\n",
       " '10569': {'奎木星君', '奎木狼', '郎君', '黄袍怪'},\n",
       " '10570': {'曲海峰'},\n",
       " '10571': {'Kindergarten', '幼儿园', '幼稚园'},\n",
       " '10572': {'风雨如磐'},\n",
       " '10573': {'春江月'},\n",
       " '10574': {'青苹果'},\n",
       " '10575': {'膻味'},\n",
       " '10576': {'方一鸣'},\n",
       " '10577': {'政治经济学'},\n",
       " '10578': {'Tyrants - Fight through Time', '暴君'},\n",
       " '10579': {'陈诚'},\n",
       " '10580': {'Mancheng District', '满城', '满城区', '满城县'},\n",
       " '10581': {'童话'},\n",
       " '10582': {'胭脂', '越剧胭脂'},\n",
       " '10583': {'kalasa', '净瓶'},\n",
       " '10584': {'Jeryl', '李佩玲'},\n",
       " '10585': {'等待'},\n",
       " '10586': {'药王'},\n",
       " '10587': {'天天向上'},\n",
       " '10588': {'最高人民法院关于人民法院执行工作若干问题的规定', '最高人民法院关于人民法院执行工作若干问题的规定（试行）'},\n",
       " '10589': {'刘志'},\n",
       " '10590': {'一条小团团', '一条小团团OvO'},\n",
       " '10591': {'天福'},\n",
       " '10592': {'李凤云'},\n",
       " '10593': {'金·斯坦利·罗宾逊'},\n",
       " '10594': {'大王来绑我呀'},\n",
       " '10595': {'明道'},\n",
       " '10596': {'李飞'},\n",
       " '10597': {'王文博'},\n",
       " '10598': {'Mia Luang/เมียหลวง', '正妻'},\n",
       " '10599': {'马冰'},\n",
       " '10600': {'Vin', '张彬彬'},\n",
       " '10601': {'新倩女幽魂'},\n",
       " '10602': {'陈峰'},\n",
       " '10603': {'少年西游'},\n",
       " '10604': {'PBOC',\n",
       "  \"People's Bank of China\",\n",
       "  \"THE PEOPLE'S BANK OF CHINA\",\n",
       "  '中国中央人民银行',\n",
       "  '中国中央银行',\n",
       "  '中国人民银行',\n",
       "  '中国人行',\n",
       "  '中国央行',\n",
       "  '人民银行',\n",
       "  '人行',\n",
       "  '央行'},\n",
       " '10605': {'李强'},\n",
       " '10606': {'liyu', '李渔'},\n",
       " '10607': {'樊登读书', '樊登读书会'},\n",
       " '10608': {'李志军'},\n",
       " '10609': {'陈晓斌'},\n",
       " '10610': {'唐颖'},\n",
       " '10611': {'野百合也有春天'},\n",
       " '10612': {'盆腔炎'},\n",
       " '10613': {'神灯', '魔术神灯'},\n",
       " '10614': {'ディタ（日语）,Dita（罗马音）', '蒂塔'},\n",
       " '10615': {'make a motion picture', '拍戏', '演戏'},\n",
       " '10616': {'并州'},\n",
       " '10617': {'A dance with dragons', '与龙共舞'},\n",
       " '10618': {'Feixian', '山东费县', '费县'},\n",
       " '10619': {'Mid-cap share', '中盘股'},\n",
       " '10620': {'张丹'},\n",
       " '10621': {'考察'},\n",
       " '10622': {'玛丽'},\n",
       " '10623': {'马冬梅'},\n",
       " '10624': {'赵越'},\n",
       " '10625': {'周劲松'},\n",
       " '10626': {'王旭'},\n",
       " '10627': {'处女情缘'},\n",
       " '10628': {'时光荏苒'},\n",
       " '10629': {'姚安'},\n",
       " '10630': {'学者'},\n",
       " '10631': {'张杰'},\n",
       " '10632': {'宋华'},\n",
       " '10633': {'东寨村'},\n",
       " '10634': {'祝愿'},\n",
       " '10635': {'公共关系管理'},\n",
       " '10636': {'张辉'},\n",
       " '10637': {'Captain', '船长'},\n",
       " '10638': {'张青'},\n",
       " '10639': {'罗晓东'},\n",
       " '10640': {'一顾倾城'},\n",
       " '10641': {'只对你有感觉'},\n",
       " '10642': {'Terence Chang', '张家振'},\n",
       " '10643': {'main account', '大号'},\n",
       " '10644': {'周宁'},\n",
       " '10645': {'爱情'},\n",
       " '10646': {'刘文雄'},\n",
       " '10647': {'Willie Stevenson（William Stevenson）', '史蒂文森', '威利·史蒂文森'},\n",
       " '10648': {'李芊墨', '笑楚'},\n",
       " '10649': {'天灯'},\n",
       " '10650': {'80s', '80后', '八零后'},\n",
       " '10651': {'傻瓜'},\n",
       " '10652': {'新产品', '新品'},\n",
       " '10653': {'Ming dynasty Navy', '大明水师', '明朝水师'},\n",
       " '10654': {'丽萍', '何丽萍'},\n",
       " '10655': {'廖佳琳'},\n",
       " '10656': {'赵静'},\n",
       " '10657': {'Janice', '吴倩'},\n",
       " '10658': {'朵朵'},\n",
       " '10659': {'Life planning', '人生规划'},\n",
       " '10660': {'Uncle', '大叔'},\n",
       " '10661': {'异世贼王'},\n",
       " '10662': {'王明荣'},\n",
       " '10663': {'新能源'},\n",
       " '10664': {'马如龙'},\n",
       " '10665': {'Wuzhong District', '吴中', '吴中区', '吴县', '吴县市', '苏州吴中', '苏州吴中区'},\n",
       " '10666': {'拉达'},\n",
       " '10667': {'张明辉'},\n",
       " '10668': {'绝世唐门'},\n",
       " '10669': {'韩启德'},\n",
       " '10670': {'蜻蜓'},\n",
       " '10671': {'GDP', 'gdp', 'geography databaseplatform'},\n",
       " '10672': {'阴道'},\n",
       " '10673': {'ブルーバード', 'ブルーバード, Blue Bird', '青鸟'},\n",
       " '10674': {'何雁'},\n",
       " '10675': {'panglong', '荣耀'},\n",
       " '10676': {'Hexi District', '天津市河西区', '天津河西区', '河西', '河西区'},\n",
       " '10677': {'歌唱'},\n",
       " '10678': {'All in', '全部'},\n",
       " '10679': {'工作'},\n",
       " '10680': {'Riders', '《极限盗党》', '末路狂澜', '末路狂澜Riders(2002)', '极速骑手', '极限盗党'},\n",
       " '10681': {'兽'},\n",
       " '10682': {'Country love romance', '乡村爱情8', '乡村爱情浪漫曲'},\n",
       " '10683': {'Alumnus; old boy; alumnus（男）; alumna（女）; school fellow', '校友'},\n",
       " '10684': {'儿童肺炎', '小儿肺炎', '肺炎'},\n",
       " '10685': {'钢琴曲'},\n",
       " '10686': {'箭矢'},\n",
       " '10687': {'Starstorms', '星际风暴'},\n",
       " '10688': {'鬼书'},\n",
       " '10689': {'一对一'},\n",
       " '10690': {'李杰'},\n",
       " '10691': {'马超'},\n",
       " '10692': {'女村长'},\n",
       " '10693': {'The younger generation', '年轻的一代', '年青一代', '年青的一代'},\n",
       " '10694': {'一个人'},\n",
       " '10695': {'中华人民共和国史'},\n",
       " '10696': {'本色'},\n",
       " '10697': {'彭巧茵'},\n",
       " '10698': {'刘波'},\n",
       " '10699': {'PASIO', '热情'},\n",
       " '10700': {'羊角村'},\n",
       " '10701': {'四废星君', '袁洪'},\n",
       " '10702': {'王学斌'},\n",
       " '10703': {'李俊'},\n",
       " '10704': {'吕四娘'},\n",
       " '10705': {'母系社会'},\n",
       " '10706': {'Primeval', '远古入侵'},\n",
       " '10707': {'犬王'},\n",
       " '10708': {'李永红'},\n",
       " '10709': {'王明清'},\n",
       " '10710': {'affective state', '情感状态', '感情状况'},\n",
       " '10711': {'spring thunder', '春雷'},\n",
       " '10712': {'剑走偏锋'},\n",
       " '10713': {'爱上最熟悉的陌生人'},\n",
       " '10714': {'无', '黎明之战'},\n",
       " '10715': {'Happiness', '幸运是我'},\n",
       " '10716': {'旋转', '빙글빙글'},\n",
       " '10717': {'兄弟'},\n",
       " '10718': {'韦嘉'},\n",
       " '10719': {'我的夫君僵尸大人'},\n",
       " '10720': {'李果'},\n",
       " '10721': {'马顿'},\n",
       " '10722': {'祝娅'},\n",
       " '10723': {'Big sister', '大姐', '曲剧'},\n",
       " '10724': {'汉武帝'},\n",
       " '10725': {'心愿'},\n",
       " '10726': {'A Day', '一天', '하루'},\n",
       " '10727': {'王勇'},\n",
       " '10728': {'you', '你'},\n",
       " '10729': {'张新艳'},\n",
       " '10730': {'夹子上的奶酪', '奶酪陷阱', '捕鼠器里的奶酪', '치즈인더트랩 / Cheese in the trap'},\n",
       " '10731': {'江北街道'},\n",
       " '10732': {'杨婷'},\n",
       " '10733': {'penis', '男性外生殖器', '阳具', '阴茎', '鸡巴'},\n",
       " '10734': {'堡垒', '译林世界文学名著：堡垒'},\n",
       " '10735': {'张菁'},\n",
       " '10736': {'爱恨情歌'},\n",
       " '10737': {'盖博'},\n",
       " '10738': {'习惯说'},\n",
       " '10739': {'Fighter', 'dnf格斗家', '女格斗家', '格斗家'},\n",
       " '10740': {'石牌村'},\n",
       " '10741': {'Blockchain', '区块链'},\n",
       " '10742': {'何滨'},\n",
       " '10743': {'SITP',\n",
       "  'Shanghai Institute of Technical Physics of the Chinese Academy of Sciences',\n",
       "  '上海技术物理所',\n",
       "  '上海技物所',\n",
       "  '上海物理研究所',\n",
       "  '中国科学院上海技术物理研究所',\n",
       "  '中国科学院上海物理研究所',\n",
       "  '中国科学院技术物理研究所',\n",
       "  '中科院上海技术物理研究所',\n",
       "  '中科院上海物理技术研究所',\n",
       "  '技术物理所',\n",
       "  '技物所'},\n",
       " '10744': {'永安'},\n",
       " '10745': {'吴琼'},\n",
       " '10746': {'神继'},\n",
       " '10747': {'凡人修仙传'},\n",
       " '10748': {'Tidal Wave', '海啸'},\n",
       " '10749': {'高峰'},\n",
       " '10750': {'韩秀英'},\n",
       " '10751': {'廖明'},\n",
       " '10752': {'一不小心'},\n",
       " '10753': {'周炜'},\n",
       " '10754': {'Morganite', '摩根石', '粉色绿柱石'},\n",
       " '10755': {'李哲'},\n",
       " '10756': {'静静'},\n",
       " '10757': {'刘雪松'},\n",
       " '10758': {'杜一飞'},\n",
       " '10759': {'Raphanus sativus', '青萝卜'},\n",
       " '10760': {'朵儿'},\n",
       " '10761': {'陷阱'},\n",
       " '10762': {'小妖'},\n",
       " '10763': {'黄牛'},\n",
       " '10764': {'刘卫东'},\n",
       " '10765': {'影舞者'},\n",
       " '10766': {'Tobias Weis', '托比雅斯·韦斯', '托比雅斯·韦斯概述', '韦斯'},\n",
       " '10767': {'เกมรักเอาคืน', '美丽的谎言'},\n",
       " '10768': {'汉', '汉水', '汉江'},\n",
       " '10769': {'heiannkyou', 'へいあんきょう／たいらのみやこ', '平安京'},\n",
       " '10770': {'百年孤独'},\n",
       " '10771': {'金玉姬'},\n",
       " '10772': {'黄霖'},\n",
       " '10773': {'不离不弃'},\n",
       " '10774': {'凤穿牡丹'},\n",
       " '10775': {'空气'},\n",
       " '10776': {'屈辱'},\n",
       " '10777': {'李海龙'},\n",
       " '10778': {'宋华'},\n",
       " '10779': {'Gyps', '兀鹫', '兀鹫属'},\n",
       " '10780': {'ROBAM', '杭州老板电器股份有限公司', '老板电器', '老板集团'},\n",
       " '10781': {'女司机'},\n",
       " '10782': {'Close Encounter of Mahjong', '局中局', '麻局'},\n",
       " '10783': {'Advantages and disadvantages', '利弊'},\n",
       " '10784': {'李广'},\n",
       " '10785': {'张淑敏'},\n",
       " '10786': {'王威'},\n",
       " '10787': {'云牙', '花千骨云牙'},\n",
       " '10788': {'Beluga', '白鲸'},\n",
       " '10789': {'汽油表', '油表'},\n",
       " '10790': {'しろ,Shiro', '白'},\n",
       " '10791': {'张红霞'},\n",
       " '10792': {'赵雪'},\n",
       " '10793': {'罗军'},\n",
       " '10794': {'母种'},\n",
       " '10795': {'Geotrupidae', '蜣螂'},\n",
       " '10796': {'王磊'},\n",
       " '10797': {'李东升'},\n",
       " '10798': {'张英'},\n",
       " '10799': {'王春华'},\n",
       " '10800': {'爆裂飞车'},\n",
       " '10801': {'刀光枪影', '大道天行'},\n",
       " '10802': {'与世隔绝'},\n",
       " '10803': {'林静'},\n",
       " '10804': {'Lupin the Third,ルパン三世,Rupan Sansei', '雷朋三世（港版译名）', '鲁邦', '鲁邦三世'},\n",
       " '10805': {'嫁衣'},\n",
       " '10806': {'Turbo Grape', '葡萄战宝', '葡萄甜甜龘（第4季）'},\n",
       " '10807': {'氧气'},\n",
       " '10808': {'勇闯天涯'},\n",
       " '10809': {'Cookies', '饼干'},\n",
       " '10810': {'假释'},\n",
       " '10811': {'南阳白河', '白河'},\n",
       " '10812': {'Asiatic Black Bear',\n",
       "  'Himalayan Black Bear',\n",
       "  'Ursus thibetanus',\n",
       "  '亚洲黑熊',\n",
       "  '月牙熊',\n",
       "  '黑熊',\n",
       "  '黑瞎子'},\n",
       " '10813': {'criminal', '刑事'},\n",
       " '10814': {'Criminal', '犯罪分子', '犯罪分子Criminal(1999)'},\n",
       " '10815': {'针尖上的天使'},\n",
       " '10816': {'太平天国'},\n",
       " '10817': {'琵琶曲'},\n",
       " '10818': {'冬梅'},\n",
       " '10819': {'退回'},\n",
       " '10820': {'王征'},\n",
       " '10821': {'夏雨'},\n",
       " '10822': {'Oboro Muramasa：The Demon Blade', '胧村正', '胧村正妖刀传', '胧村正：妖刀传'},\n",
       " '10823': {'杨宝森'},\n",
       " '10824': {'王东明'},\n",
       " '10825': {'Teela', '蒂拉'},\n",
       " '10826': {'朱光明'},\n",
       " '10827': {'父皇', '皇上', '魏帝', '魏帝（元姓）'},\n",
       " '10828': {'情深意长'},\n",
       " '10829': {'徐群'},\n",
       " '10830': {'武魂王座'},\n",
       " '10831': {'龙泉'},\n",
       " '10832': {'torrential rain', '暴雨'},\n",
       " '10833': {'王寅'},\n",
       " '10834': {'陈周'},\n",
       " '10835': {'汪明'},\n",
       " '10836': {'Haidong Hao', '郝海东'},\n",
       " '10837': {'刘宁'},\n",
       " '10838': {'Reborn: Journals and Notebooks', '重生'},\n",
       " '10839': {'李琦'},\n",
       " '10840': {'勇气', '勇气圣歌'},\n",
       " '10841': {'李祺'},\n",
       " '10842': {'馨儿'},\n",
       " '10843': {'线上漫画', '网络漫画'},\n",
       " '10844': {'李帅'},\n",
       " '10845': {'王建国'},\n",
       " '10846': {'张力'},\n",
       " '10847': {'火柴人联盟'},\n",
       " '10848': {'The Fifth Column',\n",
       "  '海明威全集：第五纵队西班牙大地',\n",
       "  '第五纵队',\n",
       "  '第五纵队-西班牙大地',\n",
       "  '第五纵队·西班牙大地',\n",
       "  '第五纵队与49个故事',\n",
       "  '第五纵队及其他',\n",
       "  '第五纵队西班牙在大地',\n",
       "  '第五纵队西班牙大地'},\n",
       " '10849': {'风景'},\n",
       " '10850': {'边原'},\n",
       " '10851': {'黄建国'},\n",
       " '10852': {'刘志山'},\n",
       " '10853': {'刘志刚'},\n",
       " '10854': {'兵马俑'},\n",
       " '10855': {'张屏'},\n",
       " '10856': {'谢光荣'},\n",
       " '10857': {'ディアボリックラヴァーズ(DIABOLIK LOVERS)', '魔鬼恋人'},\n",
       " '10858': {'百年密意'},\n",
       " '10859': {'entangled', '纠结'},\n",
       " '10860': {'Luxury,Large,Level up', '“LLL”或“3L', '高大上', '高大上档次', '高端大气上档次'},\n",
       " '10861': {'杨清明'},\n",
       " '10862': {'ローラン/Rolan', '罗兰'},\n",
       " '10863': {'人间至味是清欢'},\n",
       " '10864': {'罗军'},\n",
       " '10865': {'子宫囊肿', '子宫肿囊', '肿囊'},\n",
       " '10866': {'黄宇'},\n",
       " '10867': {'宇宙晕动病', '晕动病', '晕机', '晕机病', '晕船', '晕船病', '晕车', '晕车病', '航空晕动病'},\n",
       " '10868': {'大姨夫', '大姨父'},\n",
       " '10869': {'张龙'},\n",
       " '10870': {'尼罗河女儿'},\n",
       " '10871': {'João Moutinho', '若奥·穆蒂尼奥', '若昂·穆蒂尼奥'},\n",
       " '10872': {'马明'},\n",
       " '10873': {'魔鬼恋人', '魔鬼爱人'},\n",
       " '10874': {'奇迹'},\n",
       " '10875': {'田瑞云'},\n",
       " '10876': {'盛宴'},\n",
       " '10877': {'YU YANG', '于洋'},\n",
       " '10878': {'刘斌'},\n",
       " '10879': {'Yu Garden', '上海豫园', '豫园'},\n",
       " '10880': {'刘惠杰'},\n",
       " '10881': {'初心'},\n",
       " '10882': {'Echeveria cante Glass & Mendoza-Garcia', '广寒宫'},\n",
       " '10883': {'王一扬', '王小贱'},\n",
       " '10884': {'Maroon 5', 'maroon 5', 'maroon5', '魔力红', '魔力红乐队'},\n",
       " '10885': {'龙吟'},\n",
       " '10886': {'冰山上的来客'},\n",
       " '10887': {'山本五十六'},\n",
       " '10888': {'文青'},\n",
       " '10889': {'exercises to benefit the internal organs', '内功'},\n",
       " '10890': {'杨光'},\n",
       " '10891': {'《王者军团》', '王者军团'},\n",
       " '10892': {'灭门'},\n",
       " '10893': {'江山'},\n",
       " '10894': {'王开发'},\n",
       " '10895': {'洗衣服'},\n",
       " '10896': {'Kaavalan', '保镖'},\n",
       " '10897': {'青妖'},\n",
       " '10898': {'Good Time', '好时光'},\n",
       " '10899': {'女儿们的恋爱'},\n",
       " '10900': {'Hippopotamus amphibius', '河马'},\n",
       " '10901': {'消息'},\n",
       " '10902': {'龙山村'},\n",
       " '10903': {'叶不羞', '叶修', '叶秋'},\n",
       " '10904': {'Cao Hanwen', '曹京平', '曹汉文', '曹汉文（曹京平）', '端木蕻良'},\n",
       " '10905': {'梦中的额吉'},\n",
       " '10906': {'会计职称'},\n",
       " '10907': {'情报局'},\n",
       " '10908': {'王芳'},\n",
       " '10909': {'西钊'},\n",
       " '10910': {'赵玫'},\n",
       " '10911': {'いくら', '小犬座的星灵', '尼可拉', '普鲁'},\n",
       " '10912': {'ONE PIECE', 'ワンピース', '海贼王', '海贼王粤语版', '航海王'},\n",
       " '10913': {'diamond', 'rhombus', '菱形'},\n",
       " '10914': {'劳拉', '荆棘剑客', '荆棘剑客劳拉'},\n",
       " '10915': {'刘金花'},\n",
       " '10916': {'李磊'},\n",
       " '10917': {'前场'},\n",
       " '10918': {'将军家的小娘子'},\n",
       " '10919': {'陈浩'},\n",
       " '10920': {'春晖中学'},\n",
       " '10921': {'墨家巨子', '墨家矩子', '墨家钜子', '巨子', '矩子', '钜子'},\n",
       " '10922': {'学级'},\n",
       " '10923': {'王玲'},\n",
       " '10924': {'重庆璧山秀湖公园'},\n",
       " '10925': {\"Britain's Got Talent\", \"Britain's got talent\", '英国达人', '英国达人秀'},\n",
       " '10926': {'李胜利'},\n",
       " '10927': {'杨东升'},\n",
       " '10928': {'宋园', '肉丸'},\n",
       " '10929': {'吃'},\n",
       " '10930': {'感谢'},\n",
       " '10931': {'真是'},\n",
       " '10932': {'胜负'},\n",
       " '10933': {'邓修明'},\n",
       " '10934': {'罗娜'},\n",
       " '10935': {'county', '县'},\n",
       " '10936': {'何云'},\n",
       " '10937': {'金钥匙'},\n",
       " '10938': {'紙飛行機', '纸飞机'},\n",
       " '10939': {'万网', '中国万网', '北京万网志成科技有限公司'},\n",
       " '10940': {'刀尖圆弧', '刀尖圆弧半径'},\n",
       " '10941': {'阿斯汗'},\n",
       " '10942': {'Noragami',\n",
       "  'noragami',\n",
       "  'ノラガミ',\n",
       "  '流浪神差',\n",
       "  '良野神',\n",
       "  '诺拉神',\n",
       "  '野神良',\n",
       "  '野良神',\n",
       "  '野良野'},\n",
       " '10943': {'飞天'},\n",
       " '10944': {'Daybreak', '天亮了'},\n",
       " '10945': {'湖心亭'},\n",
       " '10946': {'油花'},\n",
       " '10947': {'KEEP RUNNING', '奔跑吧'},\n",
       " '10948': {'Honeymoon', '蜜月'},\n",
       " '10949': {'刘洪'},\n",
       " '10950': {'HCG', '试纸', '试纸验孕', '验孕试纸'},\n",
       " '10951': {'红莓花儿开'},\n",
       " '10952': {\"It's Love\", '就是爱', '这', '这就是爱', '这，就是爱'},\n",
       " '10953': {'超级小保安'},\n",
       " '10954': {'普洱'},\n",
       " '10955': {'杨忠'},\n",
       " '10956': {'丁力'},\n",
       " '10957': {'死党'},\n",
       " '10958': {'Ice beauty', '冰美人'},\n",
       " '10959': {'答疑'},\n",
       " '10960': {'李元元'},\n",
       " '10961': {'魔法棒'},\n",
       " '10962': {'WuXin：The Monster Killer', '无心法师'},\n",
       " '10963': {'张兵'},\n",
       " '10964': {'杨波'},\n",
       " '10965': {'张红星'},\n",
       " '10966': {'回家的诱惑'},\n",
       " '10967': {'刘星'},\n",
       " '10968': {'王者'},\n",
       " '10969': {'郑义'},\n",
       " '10970': {'回家'},\n",
       " '10971': {'中国平安人寿保险股份有限公司河南分公司', '中国平安河南分公司'},\n",
       " '10972': {'王国华'},\n",
       " '10973': {'东北一家人'},\n",
       " '10974': {'Walter·Dill·Scott', '斯科特', '沃尔特·迪尔·斯科特'},\n",
       " '10975': {'2018年3.15消费者权益日',\n",
       "  '315',\n",
       "  '315国际消费者权益日',\n",
       "  '3·15',\n",
       "  '3·15国际消费者权益日',\n",
       "  \"International Day for Protecting Consumers' Rights\",\n",
       "  'World Consumer Rights Day',\n",
       "  '世界消费者权益日',\n",
       "  '国际消费日',\n",
       "  '国际消费者日',\n",
       "  '国际消费者权益日',\n",
       "  '消费者日'},\n",
       " '10976': {'丰塞卡'},\n",
       " '10977': {'Carslan', '卡兰姿', '卡姿兰', '卡姿兰品牌'},\n",
       " '10978': {'贾国栋'},\n",
       " '10979': {'三轮车'},\n",
       " '10980': {'黄明'},\n",
       " '10981': {'广岛之恋'},\n",
       " '10982': {'Wait for the wind tocome', '等风来'},\n",
       " '10983': {'翡翠森林狼与羊'},\n",
       " '10984': {'吴巍'},\n",
       " '10985': {'Felix', '小飞'},\n",
       " '10986': {'南京华美美容医院', '南京华美美容医院有限公司'},\n",
       " '10987': {'不动'},\n",
       " '10988': {'仙山情缘'},\n",
       " '10989': {'蒋廷臣'},\n",
       " '10990': {'胡红霞'},\n",
       " '10991': {'厦房联合网', '厦门房产联合网', '厦门房地产联合网', '厦门联合房产网', '厦门联合网'},\n",
       " '10992': {'脱颖而出'},\n",
       " '10993': {'岳飞'},\n",
       " '10994': {'王艳玲'},\n",
       " '10995': {'龙头'},\n",
       " '10996': {'袁菲'},\n",
       " '10997': {'陈曦'},\n",
       " '10998': {'Magnolia', '玉堂春'},\n",
       " '10999': {'Miss Congeniality', '选美俏卧底'},\n",
       " '11000': {'刘仁文'},\n",
       " ...}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Round-trip the KB-id -> {subject and aliases} mapping through a pickle file:\n",
    "# write it to disk, then read it back into the same name.\n",
    "pd.to_pickle(kbid_to_entities, pickle_path/'kbid_to_entities.pkl')\n",
    "kbid_to_entities = pd.read_pickle(pickle_path/'kbid_to_entities.pkl')\n",
    "\n",
    "# Show only a handful of entries; echoing the whole mapping floods the notebook\n",
    "# with output and bloats the saved .ipynb file.\n",
    "dict(list(kbid_to_entities.items())[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "知识库大小： 324418\n",
      "实体数量： 278668\n",
      "上位类型数量： 24\n"
     ]
    }
   ],
   "source": [
    "# Headline sizes of the lookup tables built from kb.json: number of KB ids,\n",
    "# number of distinct surface forms (subjects and aliases), number of distinct types.\n",
    "kb_stats = [\n",
    "    ('知识库大小：', kbid_to_entities),\n",
    "    ('实体数量：', entity_to_kbids),\n",
    "    ('上位类型数量：', type_to_idx),\n",
    "]\n",
    "for stat_label, stat_mapping in kb_stats:\n",
    "    print(stat_label, len(stat_mapping))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "包含有大量KBID的实体：\n",
      "张健 181\n",
      "李明 229\n",
      "杨林 107\n",
      "王芳 146\n",
      "刘涛 120\n",
      "张明 157\n",
      "张华 158\n",
      "张宇 111\n",
      "张建军 109\n",
      "刘杰 156\n",
      "陈涛 122\n",
      "杨帆 128\n",
      "李磊 103\n",
      "刘斌 171\n",
      "李峰 136\n",
      "刘伟 206\n",
      "王建国 133\n",
      "陈刚 184\n",
      "张敏 161\n",
      "陈军 144\n",
      "王伟 259\n",
      "杨军 185\n",
      "张涛 188\n",
      "李平 175\n",
      "陈斌 103\n",
      "张辉 134\n",
      "李强 220\n",
      "王旭 112\n",
      "张杰 157\n",
      "李杰 156\n",
      "刘波 137\n",
      "李俊 146\n",
      "无 352\n",
      "王勇 261\n",
      "高峰 152\n",
      "王磊 169\n",
      "杨光 128\n",
      "杨波 168\n",
      "新华村 109\n",
      "王强 204\n",
      "李超 117\n",
      "刘健 102\n",
      "陈勇 187\n",
      "张磊 147\n",
      "张伟 279\n",
      "张勇 285\n",
      "王东 112\n",
      "李伟 253\n",
      "王毅 120\n",
      "王刚 181\n",
      "刘强 162\n",
      "陈平 119\n",
      "刘军 228\n",
      "刘敏 162\n",
      "李红 101\n",
      "王宏 135\n",
      "张平 115\n",
      "张建华 150\n",
      "王峰 140\n",
      "张军 199\n",
      "陈华 120\n",
      "张帆 105\n",
      "李健 134\n",
      "李静 134\n",
      "刘勇 223\n",
      "张强 149\n",
      "刘辉 124\n",
      "刘超 107\n",
      "王鹏 147\n",
      "王敏 164\n",
      "李涛 147\n",
      "张浩 103\n",
      "李军 274\n",
      "张毅 107\n",
      "李林 107\n",
      "王志刚 101\n",
      "刘刚 150\n",
      "张颖 101\n",
      "王琳 104\n",
      "李建华 130\n",
      "王飞 110\n",
      "万达广场 130\n",
      "李刚 187\n",
      "张宁 124\n",
      "李勇 232\n",
      "李莉 143\n",
      "王俊 124\n",
      "王涛 147\n",
      "王军 234\n",
      "刘畅 108\n",
      "吴刚 107\n",
      "王宁 103\n",
      "李华 133\n",
      "王健 203\n",
      "刘洋 140\n",
      "刘峰 123\n",
      "石磊 118\n",
      "杨斌 127\n",
      "王林 118\n",
      "黄勇 116\n",
      "王宇 104\n",
      "陈明 116\n",
      "陈敏 117\n",
      "杨明 105\n",
      "张斌 151\n",
      "刘鹏 101\n"
     ]
    }
   ],
   "source": [
    "# List the most ambiguous surface forms: those shared by more than 100 KB ids.\n",
    "print('包含有大量KBID的实体：')\n",
    "for surface_form, candidate_ids in entity_to_kbids.items():\n",
    "    num_candidates = len(candidate_ids)\n",
    "    if num_candidates <= 100:\n",
    "        continue\n",
    "    print(surface_form, num_candidates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD4CAYAAAAZ1BptAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAV70lEQVR4nO3df6zdd33f8edrdhN+VMEOcVhqm9kMizZE6whXiSnThHBJnBTh/BEkJ2ixqCVrLKy06gTJkGYNqARa1dBIEDUjbhxEYrKULRYL9SwnFZpETG4IzQ9C6kvSOZcEfKlNyooGmL73x/lcenp9vr6+5zj32vc+H9LR+X7fn8/3fD8ffyO//P1xTlJVSJI0yD9Z6AFIks5choQkqZMhIUnqZEhIkjoZEpKkTssXegCn2wUXXFDr1q1b6GFI0lnl0Ucf/UFVrZpZX3QhsW7dOsbHxxd6GJJ0VknyfwbVvdwkSepkSEiSOhkSkqROhoQkqZMhIUnqZEhIkjoZEpKkToaEJKmTISFJ6rTovnF9trr74OEF2e/1l79+QfYr6ezgmYQkqZMhIUnqZEhIkjrNGhJJdiU5kuTJAW3/IUkluaCtJ8mtSSaSPJ7k0r6+25Icaq9tffW3JnmibXNrkrT6+Un2t/77k6w8PVOWJJ2qUzmTuBPYPLOYZC3wLqD/jutVwIb22gHc1vqeD+wELgcuA3b2/aV/W+s7vd30vm4CDlTVBuBAW5ckzaNZQ6KqvgocHdB0C/BhoPpqW4C7qudhYEWSi4Argf1VdbSqjgH7gc2t7byq+lpVFXAXcE3fZ+1uy7v76pKkeTLUPYkk7wG+W1V/OaNpNfB83/pkq52sPjmgDvC6qnoRoL1feJLx7EgynmR8ampqiBlJkgaZc0gkeRXwUeA/DWoeUKsh6nNSVbdX1VhVja1adcL/fU+SNKRhziT+ObAe+Mskfw2sAb6R5J/SOxNY29d3DfDCLPU1A+oA32+Xo2jvR4YYqyRpBHMOiap6oqourKp1VbWO3l/0l1bV94C9wA3tKaeNwEvtUtE+4IokK9sN6yuAfa3tR0k2tqeabgDub7vaC0w/BbWtry5Jmien8gjsPcDXgDclmUyy/STdHwCeBSaA/wr8O4CqOgp8HHikvT7WagAfAD7XtvkO8JVW/yTwriSH6D1F9cm5TU2SNKpZf7upqq6bpX1d33IBN3b02wXsGlAfBy4ZUP8bYNNs45MkvXz8xrUkqZMhIUnqZEhIkjoZEpKkToaEJKmTISFJ6mRISJI6GRKSpE6GhCSpkyEhSepkSEiSOhkSkqROhoQkqdOsvwK7lNx98PBCD0GSziieSUiSOhkSkqROhoQkqZMhIUnqZEhIkjoZEpKkTrOGRJJdSY4kebKv9l+SfDvJ40n+e5IVfW03J5lI8kySK/vqm1ttIslNffX1SQ4mOZTki0nOafVz2/pEa193uiYtSTo1p3ImcSeweUZtP3BJVf0L4K+AmwGSXAxsBd7ctvlskmVJlgGfAa4CLgaua30BPgXcUlUbgGPA9lbfDhyrqjcCt7R+kqR5NGtIVNVXgaMzav+rqo631YeBNW15C7Cnqn5SVc8BE8Bl7TVRVc9W1U+BPcCWJAHeCdzXtt8NXNP3Wbvb8n3AptZfkjRPTsc9id8GvtKWVwPP97VNtlpX/bXAD/sCZ7r+jz6rtb/U+p8gyY4k40nGp6amRp6QJKlnpJBI8lHgOPCF6dKAbjVE/WSfdWKx6vaqGquqsVWrVp180JKkUzb0bzcl2Qa8G9hUVdN/eU8Ca/u6rQFeaMuD6j8AViRZ3s4W+vtPf9ZkkuXAa5hx2UuS9PIa6kwiyWbgI8B7qurHfU17ga3tyaT1wAbg68AjwIb2JNM59G5u723h8hBwbdt+G3B/32dta8vXAg/2hZEkaR7MeiaR5B7gHcAFSSaBnfSeZjoX2N/uJT9cVf+2qp5K
ci/wLXqXoW6sqp+3z/kgsA9YBuyqqqfaLj4C7EnyCeAx4I5WvwP4fJIJemcQW0/DfCVJc5DF9o/zsbGxGh8fH2rbpfhT4ddf/vqFHoKkM0CSR6tqbGbdb1xLkjoZEpKkToaEJKmTISFJ6mRISJI6GRKSpE6GhCSpkyEhSepkSEiSOhkSkqROhoQkqZMhIUnqZEhIkjoZEpKkToaEJKmTISFJ6mRISJI6GRKSpE6GhCSp06whkWRXkiNJnuyrnZ9kf5JD7X1lqyfJrUkmkjye5NK+bba1/oeSbOurvzXJE22bW5PkZPuQJM2fUzmTuBPYPKN2E3CgqjYAB9o6wFXAhvbaAdwGvb/wgZ3A5cBlwM6+v/Rva32nt9s8yz4kSfNk1pCoqq8CR2eUtwC72/Ju4Jq++l3V8zCwIslFwJXA/qo6WlXHgP3A5tZ2XlV9raoKuGvGZw3ahyRpngx7T+J1VfUiQHu/sNVXA8/39ZtstZPVJwfUT7aPEyTZkWQ8yfjU1NSQU5IkzXS6b1xnQK2GqM9JVd1eVWNVNbZq1aq5bi5J6jBsSHy/XSqivR9p9UlgbV+/NcALs9TXDKifbB+SpHkybEjsBaafUNoG3N9Xv6E95bQReKldKtoHXJFkZbthfQWwr7X9KMnG9lTTDTM+a9A+JEnzZPlsHZLcA7wDuCDJJL2nlD4J3JtkO3AYeG/r/gBwNTAB/Bh4P0BVHU3yceCR1u9jVTV9M/wD9J6geiXwlfbiJPuQJM2TWUOiqq7raNo0oG8BN3Z8zi5g14D6OHDJgPrfDNqHJGn++I1rSVInQ0KS1MmQkCR1MiQkSZ0MCUlSJ0NCktTJkJAkdTIkJEmdDAlJUidDQpLUyZCQJHUyJCRJnQwJSVInQ0KS1MmQkCR1MiQkSZ0MCUlSJ0NCktTJkJAkdTIkJEmdRgqJJL+X5KkkTya5J8krkqxPcjDJoSRfTHJO63tuW59o7ev6PufmVn8myZV99c2tNpHkplHGKkmau6FDIslq4HeAsaq6BFgGbAU+BdxSVRuAY8D2tsl24FhVvRG4pfUjycVtuzcDm4HPJlmWZBnwGeAq4GLgutZXkjRPRr3ctBx4ZZLlwKuAF4F3Ave19t3ANW15S1untW9KklbfU1U/qarngAngsvaaqKpnq+qnwJ7WV5I0T4YOiar6LvCHwGF64fAS8Cjww6o63rpNAqvb8mrg+bbt8db/tf31Gdt01U+QZEeS8STjU1NTw05JkjTDKJebVtL7l/164FeAV9O7NDRTTW/S0TbX+onFqturaqyqxlatWjXb0CVJp2iUy02/CTxXVVNV9TPgS8BvACva5SeANcALbXkSWAvQ2l8DHO2vz9imqy5JmiejhMRhYGOSV7V7C5uAbwEPAde2PtuA+9vy3rZOa3+wqqrVt7ann9YDG4CvA48AG9rTUufQu7m9d4TxSpLmaPnsXQarqoNJ7gO+ARwHHgNuB/4nsCfJJ1rtjrbJHcDnk0zQO4PY2j7nqST30guY48CNVfVzgCQfBPbRe3JqV1U9Nex4JUlzl94/5hePsbGxGh8fH2rbuw8ePs2jOfNdf/nrF3oIks4ASR6tqrGZdb9xLUnqZEhIkjoZEpKkToaEJKmTISFJ6mRISJI6GRKSpE6GhCSpkyEhSepkSEiSOhkSkqROhoQkqZMhIUnqZEhIkjoZEpKkToaEJKmTISFJ6mRISJI6GRKSpE4jhUSSFUnuS/LtJE8neVuS85PsT3Kova9sfZPk1iQTSR5Pcmnf52xr/Q8l2dZXf2uSJ9o2tybJKOOVJM3NqGcSfwz8eVX9KvDrwNPATcCBqtoAHGjrAFcBG9prB3AbQJLzgZ3A5cBlwM7pYGl9dvRtt3nE8UqS5mDokEhyHvCvgTsAquqnVfVDYAuwu3XbDVzTlrcAd1XPw8CKJBcBVwL7q+poVR0D9gObW9t5VfW1qirgrr7PkiTNg1HOJN4ATAF/muSxJJ9L8mrgdVX1IkB7
v7D1Xw0837f9ZKudrD45oH6CJDuSjCcZn5qaGmFKkqR+o4TEcuBS4Laqegvwd/zDpaVBBt1PqCHqJxarbq+qsaoaW7Vq1clHLUk6ZaOExCQwWVUH2/p99ELj++1SEe39SF//tX3brwFemKW+ZkBdkjRPhg6Jqvoe8HySN7XSJuBbwF5g+gmlbcD9bXkvcEN7ymkj8FK7HLUPuCLJynbD+gpgX2v7UZKN7ammG/o+S5I0D5aPuP2/B76Q5BzgWeD99ILn3iTbgcPAe1vfB4CrgQngx60vVXU0yceBR1q/j1XV0bb8AeBO4JXAV9pLkjRPRgqJqvomMDagadOAvgXc2PE5u4BdA+rjwCWjjFGSNDy/cS1J6mRISJI6GRKSpE6GhCSpkyEhSepkSEiSOhkSkqROhoQkqZMhIUnqZEhIkjoZEpKkToaEJKmTISFJ6mRISJI6GRKSpE6GhCSpkyEhSepkSEiSOhkSkqROhoQkqdPIIZFkWZLHkny5ra9PcjDJoSRfTHJOq5/b1ida+7q+z7i51Z9JcmVffXOrTSS5adSxSpLm5nScSXwIeLpv/VPALVW1ATgGbG/17cCxqnojcEvrR5KLga3Am4HNwGdb8CwDPgNcBVwMXNf6SpLmyUghkWQN8FvA59p6gHcC97Uuu4Fr2vKWtk5r39T6bwH2VNVPquo5YAK4rL0mqurZqvopsKf1lSTNk1HPJD4NfBj4+7b+WuCHVXW8rU8Cq9vyauB5gNb+Uuv/i/qMbbrqJ0iyI8l4kvGpqakRpyRJmjZ0SCR5N3Ckqh7tLw/oWrO0zbV+YrHq9qoaq6qxVatWnWTUkqS5WD7Ctm8H3pPkauAVwHn0zixWJFnezhbWAC+0/pPAWmAyyXLgNcDRvvq0/m266pKkeTD0mURV3VxVa6pqHb0bzw9W1fuAh4BrW7dtwP1teW9bp7U/WFXV6lvb00/rgQ3A14FHgA3taalz2j72DjteSdLcjXIm0eUjwJ4knwAeA+5o9TuAzyeZoHcGsRWgqp5Kci/wLeA4cGNV/RwgyQeBfcAyYFdVPfUyjFeS1OG0hERV/QXwF235WXpPJs3s8/+A93Zs/wfAHwyoPwA8cDrGKEmaO79xLUnq9HJcbtJZ5O6Dhxdkv9df/voF2a+kufFMQpLUyZCQJHUyJCRJnQwJSVInQ0KS1MmQkCR1MiQkSZ0MCUlSJ0NCktTJkJAkdTIkJEmdDAlJUidDQpLUyZCQJHUyJCRJnQwJSVInQ0KS1GnokEiyNslDSZ5O8lSSD7X6+Un2JznU3le2epLcmmQiyeNJLu37rG2t/6Ek2/rqb03yRNvm1iQZZbKSpLkZ5UziOPD7VfVrwEbgxiQXAzcBB6pqA3CgrQNcBWxorx3AbdALFWAncDlwGbBzOlhanx19220eYbySpDkaOiSq6sWq+kZb/hHwNLAa2ALsbt12A9e05S3AXdXzMLAiyUXAlcD+qjpaVceA/cDm1nZeVX2tqgq4q++zJEnz4LTck0iyDngLcBB4XVW9CL0gAS5s3VYDz/dtNtlqJ6tPDqgP2v+OJONJxqempkadjiSpGTkkkvwy8GfA71bV356s64BaDVE/sVh1e1WNVdXYqlWrZhuyJOkUjRQSSX6JXkB8oaq+1Mrfb5eKaO9HWn0SWNu3+RrghVnqawbUJUnzZJSnmwLcATxdVX/U17QXmH5CaRtwf1/9hvaU00bgpXY5ah9wRZKV7Yb1FcC+1vajJBvbvm7o+yxJ0jxYPsK2bwf+DfBEkm+22n8EPgncm2Q7cBh4b2t7ALgamAB+DLwfoKqOJvk48Ejr97GqOtqWPwDcCbwS+Ep7SZLmydAhUVX/m8H3DQA2DehfwI0dn7UL2DWgPg5cMuwYJUmj8RvXkqROhoQkqZMhIUnqZEhIkjoZEpKkToaEJKnTKN+TkIZ298HDC7bv6y9//YLtWzrbeCYhSepkSEiSOhkSkqROhoQkqZMhIUnqZEhIkjoZEpKk
ToaEJKmTISFJ6mRISJI6GRKSpE7+dpOWnIX63Sh/M0pnI88kJEmdzviQSLI5yTNJJpLctNDjkaSl5Iy+3JRkGfAZ4F3AJPBIkr1V9a2FHZk0d/48us5GZ3RIAJcBE1X1LECSPcAWwJCQ5sD7MBrWmR4Sq4Hn+9YngctndkqyA9jRVv9vkmfmuJ8LgB8MNcKz21Kct3OeR+9biJ32LMXjDKPN+58NKp7pIZEBtTqhUHU7cPvQO0nGq2ps2O3PVktx3s55aViKc4aXZ95n+o3rSWBt3/oa4IUFGoskLTlnekg8AmxIsj7JOcBWYO8Cj0mSlowz+nJTVR1P8kFgH7AM2FVVT70Muxr6UtVZbinO2zkvDUtxzvAyzDtVJ1zilyQJOPMvN0mSFpAhIUnqtORDYrH+7EeStUkeSvJ0kqeSfKjVz0+yP8mh9r6y1ZPk1vbn8HiSSxd2BsNLsizJY0m+3NbXJznY5vzF9hAESc5t6xOtfd1CjntYSVYkuS/Jt9vxftsSOc6/1/7bfjLJPUlesdiOdZJdSY4kebKvNudjm2Rb638oyba5jGFJh0Tfz35cBVwMXJfk4oUd1WlzHPj9qvo1YCNwY5vbTcCBqtoAHGjr0Psz2NBeO4Db5n/Ip82HgKf71j8F3NLmfAzY3urbgWNV9UbgltbvbPTHwJ9X1a8Cv05v7ov6OCdZDfwOMFZVl9B7sGUri+9Y3wlsnlGb07FNcj6wk94XkS8Ddk4HyympqiX7At4G7Otbvxm4eaHH9TLN9X56v4H1DHBRq10EPNOW/wS4rq//L/qdTS9636U5ALwT+DK9L2T+AFg+85jTe2rubW15eeuXhZ7DHOd7HvDczHEvgeM8/WsM57dj92XgysV4rIF1wJPDHlvgOuBP+ur/qN9sryV9JsHgn/1YvUBjedm0U+u3AAeB11XViwDt/cLWbbH8WXwa+DDw9239tcAPq+p4W++f1y/m3Npfav3PJm8ApoA/bZfYPpfk1Szy41xV3wX+EDgMvEjv2D3K4j7W0+Z6bEc65ks9JE7pZz/OZkl+Gfgz4Her6m9P1nVA7az6s0jybuBIVT3aXx7QtU6h7WyxHLgUuK2q3gL8Hf9w+WGQxTBn2uWSLcB64FeAV9O73DLTYjrWs+ma40hzX+ohsah/9iPJL9ELiC9U1Zda+ftJLmrtFwFHWn0x/Fm8HXhPkr8G9tC75PRpYEWS6S+O9s/rF3Nu7a8Bjs7ngE+DSWCyqg629fvohcZiPs4Avwk8V1VTVfUz4EvAb7C4j/W0uR7bkY75Ug+JRfuzH0kC3AE8XVV/1Ne0F5h+umEbvXsV0/Ub2hMSG4GXpk9pzxZVdXNVramqdfSO5YNV9T7gIeDa1m3mnKf/LK5t/c+qf11W1feA55O8qZU20fsp/UV7nJvDwMYkr2r/rU/Pe9Ee6z5zPbb7gCuSrGxnYFe02qlZ6JsyC/0Crgb+CvgO8NGFHs9pnNe/ondK+Tjwzfa6mt512APAofZ+fusfek96fQd4gt5TIws+jxHm/w7gy235DcDXgQngvwHntvor2vpEa3/DQo97yLn+S2C8Hev/AaxcCscZ+M/At4Engc8D5y62Yw3cQ++ey8/onRFsH+bYAr/d5j4BvH8uY/BnOSRJnZb65SZJ0kkYEpKkToaEJKmTISFJ6mRISJI6GRKSpE6GhCSp0/8He4xz8gb5mlEAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Distribution of KB description lengths in characters. Lengths are clipped at\n",
    "# max_text_len so the long tail collapses into the last bin instead of stretching the axis.\n",
    "max_text_len = 1000\n",
    "text_lengths = [min(max_text_len, len(text)) for text in kbid_to_text.values()]\n",
    "\n",
    "# Plain matplotlib histogram of raw counts; sns.distplot is deprecated in current\n",
    "# seaborn releases, while Axes.hist behaves the same across versions.\n",
    "fig, ax = plt.subplots()\n",
    "ax.hist(text_lengths, bins=10)\n",
    "ax.set(\n",
    "    title='KB description length distribution',\n",
    "    xlabel='characters (clipped at {})'.format(max_text_len),\n",
    "    ylabel='number of KB entries',\n",
    ")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build binary entity-linking samples from train.json.\n",
    "# For every mention whose kb_id is numeric, each KB candidate sharing the mention's\n",
    "# surface form yields one row: predict=1 for the annotated kb_id, predict=0 for\n",
    "# other candidates, with at most max_negs_per_mention negative rows per mention.\n",
    "max_negs_per_mention = 2\n",
    "link_dict = defaultdict(list)\n",
    "\n",
    "# JSON text is UTF-8; do not rely on the platform default encoding.\n",
    "with open(raw_path/'train.json', 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        temp = json.loads(line)\n",
    "        \n",
    "        for data in temp['mention_data']:\n",
    "            # Only mentions annotated with a numeric kb_id are used for linking samples.\n",
    "            if not data['kb_id'].isdigit():\n",
    "                continue\n",
    "            \n",
    "            entity = data['mention']\n",
    "            # Use .get() rather than indexing: entity_to_kbids is a defaultdict, so\n",
    "            # indexing an unseen mention would silently insert an empty set into it\n",
    "            # and change the KB statistics computed from that mapping.\n",
    "            kbids = entity_to_kbids.get(entity, ())\n",
    "            \n",
    "            num_negs = 0\n",
    "            # Sorted iteration keeps the chosen negatives (and the row order) stable\n",
    "            # across kernel restarts; raw set order of str ids varies with hash seeding.\n",
    "            for kbid in sorted(kbids):\n",
    "                is_gold = kbid == data['kb_id']\n",
    "                if not is_gold:\n",
    "                    if num_negs >= max_negs_per_mention:\n",
    "                        continue\n",
    "                    num_negs += 1\n",
    "                link_dict['entity'].append(entity)\n",
    "                link_dict['offset'].append(data['offset'])\n",
    "                link_dict['rawtext'].append(temp['text'])\n",
    "                link_dict['kbtext'].append(kbid_to_text[kbid])\n",
    "                link_dict['predict'].append(1 if is_gold else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialise the collected link samples as a table and save it as a\n",
    "# tab-separated file without the row index.\n",
    "train_link = pd.DataFrame(data=link_dict)\n",
    "train_link.to_csv(csv_path.joinpath('train_link.csv'), sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build entity-typing samples from train.json: one row per (mention, type) pair.\n",
    "# Mentions with a numeric kb_id take their types from the KB entry; other mentions\n",
    "# carry their type labels directly in the kb_id field, separated by '|'.\n",
    "type_dict = defaultdict(list)\n",
    "\n",
    "# JSON text is UTF-8; do not rely on the platform default encoding.\n",
    "with open(raw_path/'train.json', 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        temp = json.loads(line)\n",
    "\n",
    "        for data in temp['mention_data']:\n",
    "            entity = data['mention']\n",
    "            \n",
    "            if data['kb_id'].isdigit():\n",
    "                entity_type = kbid_to_types[data['kb_id']]\n",
    "            else:\n",
    "                # Drop the first 4 characters of every label - presumably a fixed\n",
    "                # 4-character prefix such as 'NIL_'; TODO confirm against the data.\n",
    "                entity_type = [label[4:] for label in data['kb_id'].split('|')]\n",
    "            for e in entity_type:\n",
    "                type_dict['entity'].append(entity)\n",
    "                type_dict['offset'].append(data['offset'])\n",
    "                type_dict['rawtext'].append(temp['text'])\n",
    "                type_dict['type'].append(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Awards',\n",
       " 'Biological',\n",
       " 'Brand',\n",
       " 'Constellation',\n",
       " 'Culture',\n",
       " 'Diagnosis&Treatment',\n",
       " 'Disease&Symptom',\n",
       " 'Education',\n",
       " 'Event',\n",
       " 'Food',\n",
       " 'Game',\n",
       " 'Law&Regulation',\n",
       " 'Location',\n",
       " 'Medicine',\n",
       " 'Natural&Geography',\n",
       " 'Organization',\n",
       " 'Other',\n",
       " 'Person',\n",
       " 'Software',\n",
       " 'Time&Calendar',\n",
       " 'Vehicle',\n",
       " 'VirtualThings',\n",
       " 'Website',\n",
       " 'Work'}"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Materialise the typing samples, save them as a tab-separated file without the\n",
    "# row index, then display the distinct type labels that occur in the training data.\n",
    "train_type = pd.DataFrame(data=type_dict)\n",
    "train_type.to_csv(csv_path.joinpath('train_type.csv'), sep='\\t', index=False)\n",
    "set(train_type.loc[:, 'type'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
